In [88]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [115]:
train = pd.read_csv('train.csv')
y = train['Survived']
del train['Survived']
train.head()


Out[115]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [57]:
train.shape


Out[57]:
(891, 11)

In [116]:
test = pd.read_csv('test.csv')
test.head()


Out[116]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

In [59]:
test.shape


Out[59]:
(418, 11)

Проведем краткое исследование данных, построим бейзлайн, результаты которого попытаемся улучшить


In [129]:
data = pd.concat([train, test], ignore_index = True)
data.head()


Out[129]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [64]:
data.describe()


Out[64]:
PassengerId Pclass Age SibSp Parch Fare
count 1309.000000 1309.000000 1046.000000 1309.000000 1309.000000 1308.000000
mean 655.000000 2.294882 29.881138 0.498854 0.385027 33.295479
std 378.020061 0.837836 14.413493 1.041658 0.865560 51.758668
min 1.000000 1.000000 0.170000 0.000000 0.000000 0.000000
25% 328.000000 2.000000 21.000000 0.000000 0.000000 7.895800
50% 655.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 982.000000 3.000000 39.000000 1.000000 0.000000 31.275000
max 1309.000000 3.000000 80.000000 8.000000 9.000000 512.329200

In [65]:
data.isnull().sum()


Out[65]:
PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

Пропущенные данные в графе возраст заменим медианой, данные в графе с номером кабины практически отсутствуют, удалим его, пропущенные данные в графе порт посадки заменим на наиболее часто встречающиеся значение


In [130]:
data['Age'] = data['Age'].fillna(np.median(data['Age'].loc[(data['Age'].isnull()== False)]))

In [131]:
data['Embarked'].value_counts()


Out[131]:
S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [132]:
data['Embarked'] = data['Embarked'].fillna('S')
del data['Cabin']

In [133]:
data.Pclass.loc[data['Fare'].isnull()]


Out[133]:
1043    3
Name: Pclass, dtype: int64

Пасажир для которого пропущено значение цены его билета был в третьем классе, заполним это значение медианой цен на билеты в третьем классе


In [134]:
data['Fare'] = data['Fare'].fillna(np.median(data.Fare[data.Pclass == 3][np.isnan(data.Fare)==False]))

In [71]:
data.isnull().sum()


Out[71]:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

Закодируем категориальные перменные, исследуем признак 'Name' и получим датасет для получения базовой линии, которую мы будем пытаться превзойти


In [72]:
f, ax = plt.subplots(figsize=(8, 8))
sns.barplot(
    ax=ax,
    x='Pclass',
    y=y,
    hue='Sex',
    data=train,
    capsize=0.05
)
ax.set_title("Survival By Gender and Ticket Class")
ax.set_ylabel("Survival (%)")
ax.set_xlabel("")
ax.set_xticklabels(["First Class", "Second Class", "Third Class"])
plt.show()


В первую очердь на спасали женьщин, также спаслась большая доля мужчин из первого класса (скорее всего обеспеченных)


In [37]:
train.Name


Out[37]:
0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Williams, Mr. Charles Eugene
18     Vander Planke, Mrs. Julius (Emelia Maria Vande...
19                               Masselmani, Mrs. Fatima
20                                  Fynney, Mr. Joseph J
21                                 Beesley, Mr. Lawrence
22                           McGowan, Miss. Anna "Annie"
23                          Sloper, Mr. William Thompson
24                         Palsson, Miss. Torborg Danira
25     Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
26                               Emir, Mr. Farred Chehab
27                        Fortune, Mr. Charles Alexander
28                         O'Dwyer, Miss. Ellen "Nellie"
29                                   Todoroff, Mr. Lalio
                             ...                        
861                          Giles, Mr. Frederick Edward
862    Swift, Mrs. Frederick Joel (Margaret Welles Ba...
863                    Sage, Miss. Dorothy Edith "Dolly"
864                               Gill, Mr. John William
865                             Bystrom, Mrs. (Karolina)
866                         Duran y More, Miss. Asuncion
867                 Roebling, Mr. Washington Augustus II
868                          van Melkebeke, Mr. Philemon
869                      Johnson, Master. Harold Theodor
870                                    Balkic, Mr. Cerin
871     Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
872                             Carlsson, Mr. Frans Olof
873                          Vander Cruyssen, Mr. Victor
874                Abelson, Mrs. Samuel (Hannah Wizosky)
875                     Najib, Miss. Adele Kiamie "Jane"
876                        Gustafsson, Mr. Alfred Ossian
877                                 Petroff, Mr. Nedelio
878                                   Laleff, Mr. Kristo
879        Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
880         Shelley, Mrs. William (Imanita Parrish Hall)
881                                   Markun, Mr. Johann
882                         Dahlberg, Miss. Gerda Ulrika
883                        Banfield, Mr. Frederick James
884                               Sutehall, Mr. Henry Jr
885                 Rice, Mrs. William (Margaret Norton)
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

Длина имени во времена Титаника часто указывала на высокое происхождение, что часто связано с достатком, а так как достаток влияет на вероятность выжить, то создадим новый признак - число символов в имени


In [135]:
data['name_len'] = data['Name'].apply(len)
data['name_len']


Out[135]:
0       23
1       51
2       22
3       44
4       24
5       16
6       23
7       30
8       49
9       35
10      31
11      24
12      30
13      27
14      36
15      32
16      20
17      28
18      55
19      23
20      20
21      21
22      27
23      28
24      29
25      57
26      23
27      30
28      29
29      19
        ..
1279    20
1280    27
1281    26
1282    46
1283    29
1284    20
1285    24
1286    46
1287    20
1288    63
1289    30
1290    24
1291    23
1292    15
1293    30
1294    22
1295    28
1296    44
1297    25
1298    26
1299    31
1300    25
1301    22
1302    47
1303    30
1304    18
1305    28
1306    28
1307    19
1308    24
Name: name_len, Length: 1309, dtype: int64

In [136]:
label_enc = LabelEncoder()
data.Sex = label_enc.fit_transform(data.Sex)

In [137]:
data.Embarked = label_enc.fit_transform(data.Embarked)

удалим данные которые не будем использовать на данном этапк: PassengerId, Name, Ticket


In [138]:
train_2 = data.ix[0:890]
del train_2['Name']
del train_2['Ticket']
train_2.head()


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.
Out[138]:
PassengerId Pclass Sex Age SibSp Parch Fare Embarked name_len
0 1 3 1 22.0 1 0 7.2500 2 23
1 2 1 0 38.0 1 0 71.2833 0 51
2 3 3 0 26.0 0 0 7.9250 2 22
3 4 1 0 35.0 1 0 53.1000 2 44
4 5 3 1 35.0 0 0 8.0500 2 24

In [140]:
test_2 = data.ix[891:]
del test_2['Name']
del test_2['Ticket']


Out[140]:
PassengerId Pclass Sex Age SibSp Parch Fare Embarked name_len
0 1 3 1 22.0 1 0 7.2500 2 23
1 2 1 0 38.0 1 0 71.2833 0 51
2 3 3 0 26.0 0 0 7.9250 2 22
3 4 1 0 35.0 1 0 53.1000 2 44
4 5 3 1 35.0 0 0 8.0500 2 24
5 6 3 1 28.0 0 0 8.4583 1 16
6 7 1 1 54.0 0 0 51.8625 2 23
7 8 3 1 2.0 3 1 21.0750 2 30
8 9 3 0 27.0 0 2 11.1333 2 49
9 10 2 0 14.0 1 0 30.0708 0 35
10 11 3 0 4.0 1 1 16.7000 2 31
11 12 1 0 58.0 0 0 26.5500 2 24
12 13 3 1 20.0 0 0 8.0500 2 30
13 14 3 1 39.0 1 5 31.2750 2 27
14 15 3 0 14.0 0 0 7.8542 2 36
15 16 2 0 55.0 0 0 16.0000 2 32
16 17 3 1 2.0 4 1 29.1250 1 20
17 18 2 1 28.0 0 0 13.0000 2 28
18 19 3 0 31.0 1 0 18.0000 2 55
19 20 3 0 28.0 0 0 7.2250 0 23
20 21 2 1 35.0 0 0 26.0000 2 20
21 22 2 1 34.0 0 0 13.0000 2 21
22 23 3 0 15.0 0 0 8.0292 1 27
23 24 1 1 28.0 0 0 35.5000 2 28
24 25 3 0 8.0 3 1 21.0750 2 29
25 26 3 0 38.0 1 5 31.3875 2 57
26 27 3 1 28.0 0 0 7.2250 0 23
27 28 1 1 19.0 3 2 263.0000 2 30
28 29 3 0 28.0 0 0 7.8792 1 29
29 30 3 1 28.0 0 0 7.8958 2 19
... ... ... ... ... ... ... ... ... ...
861 862 2 1 21.0 1 0 11.5000 2 27
862 863 1 0 48.0 0 0 25.9292 2 51
863 864 3 0 28.0 8 2 69.5500 2 33
864 865 2 1 24.0 0 0 13.0000 2 22
865 866 2 0 42.0 0 0 13.0000 2 24
866 867 2 0 27.0 1 0 13.8583 0 28
867 868 1 1 31.0 0 0 50.4958 2 36
868 869 3 1 28.0 0 0 9.5000 2 27
869 870 3 1 4.0 1 1 11.1333 2 31
870 871 3 1 26.0 0 0 7.8958 2 17
871 872 1 0 47.0 1 1 52.5542 2 48
872 873 1 1 33.0 0 0 5.0000 2 24
873 874 3 1 47.0 0 0 9.0000 2 27
874 875 2 0 28.0 1 0 24.0000 0 37
875 876 3 0 15.0 0 0 7.2250 0 32
876 877 3 1 20.0 0 0 9.8458 2 29
877 878 3 1 19.0 0 0 7.8958 2 20
878 879 3 1 28.0 0 0 7.8958 2 18
879 880 1 0 56.0 0 1 83.1583 0 45
880 881 2 0 25.0 0 1 26.0000 2 44
881 882 3 1 33.0 0 0 7.8958 2 18
882 883 3 0 22.0 0 0 10.5167 2 28
883 884 2 1 28.0 0 0 10.5000 2 29
884 885 3 1 25.0 0 0 7.0500 2 22
885 886 3 0 39.0 0 5 29.1250 1 36
886 887 2 1 27.0 0 0 13.0000 2 21
887 888 1 0 19.0 0 0 30.0000 2 28
888 889 3 0 28.0 1 2 23.4500 2 40
889 890 1 1 26.0 0 0 30.0000 0 21
890 891 3 1 32.0 0 0 7.7500 1 19

891 rows × 9 columns


In [141]:
test_2


Out[141]:
PassengerId Pclass Sex Age SibSp Parch Fare Embarked name_len
891 892 3 1 34.5 0 0 7.8292 1 16
892 893 3 0 47.0 1 0 7.0000 2 32
893 894 2 1 62.0 0 0 9.6875 1 25
894 895 3 1 27.0 0 0 8.6625 2 16
895 896 3 0 22.0 1 1 12.2875 2 44
896 897 3 1 14.0 0 0 9.2250 2 26
897 898 3 0 30.0 0 0 7.6292 1 20
898 899 2 1 26.0 1 1 29.0000 2 28
899 900 3 0 18.0 0 0 7.2292 0 41
900 901 3 1 21.0 2 0 24.1500 2 23
901 902 3 1 28.0 0 0 7.8958 2 16
902 903 1 1 46.0 0 0 26.0000 2 26
903 904 1 0 23.0 1 0 82.2667 2 45
904 905 2 1 63.0 1 0 26.0000 2 20
905 906 1 0 47.0 1 0 61.1750 2 55
906 907 2 0 24.0 1 0 27.7208 0 45
907 908 2 1 35.0 0 0 12.3500 1 17
908 909 3 1 21.0 0 0 7.2250 0 17
909 910 3 0 27.0 1 0 7.9250 2 28
910 911 3 0 45.0 0 0 7.2250 0 37
911 912 1 1 55.0 1 0 59.4000 0 22
912 913 3 1 9.0 0 1 3.1708 2 25
913 914 1 0 28.0 0 0 31.6833 2 36
914 915 1 1 21.0 0 1 61.3792 0 31
915 916 1 0 48.0 1 3 262.3750 0 47
916 917 3 1 50.0 1 0 14.5000 2 23
917 918 1 0 22.0 0 1 61.9792 0 28
918 919 3 1 22.5 0 0 7.2250 0 17
919 920 1 1 41.0 0 0 30.5000 2 23
920 921 3 1 28.0 2 0 21.6792 0 17
... ... ... ... ... ... ... ... ... ...
1279 1280 3 1 21.0 0 0 7.7500 1 20
1280 1281 3 1 6.0 3 1 21.0750 2 27
1281 1282 1 1 23.0 0 0 93.5000 2 26
1282 1283 1 0 51.0 0 1 39.4000 2 46
1283 1284 3 1 13.0 0 2 20.2500 2 29
1284 1285 2 1 47.0 0 0 10.5000 2 20
1285 1286 3 1 29.0 3 1 22.0250 2 24
1286 1287 1 0 18.0 1 0 60.0000 2 46
1287 1288 3 1 24.0 0 0 7.2500 1 20
1288 1289 1 0 48.0 1 1 79.2000 0 63
1289 1290 3 1 22.0 0 0 7.7750 2 30
1290 1291 3 1 31.0 0 0 7.7333 1 24
1291 1292 1 0 30.0 0 0 164.8667 2 23
1292 1293 2 1 38.0 1 0 21.0000 2 15
1293 1294 1 0 22.0 0 1 59.4000 0 30
1294 1295 1 1 17.0 0 0 47.1000 2 22
1295 1296 1 1 43.0 1 0 27.7208 0 28
1296 1297 2 1 20.0 0 0 13.8625 0 44
1297 1298 2 1 23.0 1 0 10.5000 2 25
1298 1299 1 1 50.0 1 1 211.5000 0 26
1299 1300 3 0 28.0 0 0 7.7208 1 31
1300 1301 3 0 3.0 1 1 13.7750 2 25
1301 1302 3 0 28.0 0 0 7.7500 1 22
1302 1303 1 0 37.0 1 0 90.0000 1 47
1303 1304 3 0 28.0 0 0 7.7750 2 30
1304 1305 3 1 28.0 0 0 8.0500 2 18
1305 1306 1 0 39.0 0 0 108.9000 0 28
1306 1307 3 1 38.5 0 0 7.2500 2 28
1307 1308 3 1 28.0 0 0 8.0500 2 19
1308 1309 3 1 28.0 1 1 22.3583 0 24

418 rows × 9 columns

Построение базовой линии

Можно предположить, что давольно хороший результат нам даст рандомфорест, используем его для построения базовый линии


In [142]:
X_train, X_test, y_train, y_test = train_test_split(train_2, y, test_size=0.25, random_state=42)

In [96]:
forest = RandomForestClassifier()
params = {
    'n_estimators':[100,300,500,700,900],
    'criterion': ('gini','entropy'),
    'max_features': (5,6,7,8,9,'auto', 'log2'),
    'max_depth': (None, 50,40,30,20,10),
    
}
forest_grid = GridSearchCV(forest, params)
forest_grid.fit(X_train, y_train)


Out[96]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [100, 300, 500, 700, 900], 'criterion': ('gini', 'entropy'), 'max_features': (5, 6, 7, 8, 9, 'auto', 'log2'), 'max_depth': (None, 50, 40, 30, 20, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [97]:
forest_grid.best_estimator_


Out[97]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=50, max_features=9, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [98]:
forest_grid.best_score_


Out[98]:
0.82185628742514971

In [99]:
y_pred = forest_grid.best_estimator_.predict(X_test)

In [100]:
accuracy_score(y_test, y_pred)


Out[100]:
0.82959641255605376

переобучим лучший классификатора на полном объеме данных


In [143]:
best_forest = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=50, max_features=9, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
best_forest.fit(train_2,y)


Out[143]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=50, max_features=9, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [144]:
y_pred = best_forest.predict(test_2)

In [ ]:


In [148]:
sub_samp = pd.read_csv('gender_submission.csv')
sub_samp


Out[148]:
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
5 897 0
6 898 1
7 899 0
8 900 1
9 901 0
10 902 0
11 903 0
12 904 1
13 905 0
14 906 1
15 907 1
16 908 0
17 909 0
18 910 1
19 911 1
20 912 0
21 913 0
22 914 1
23 915 0
24 916 1
25 917 0
26 918 1
27 919 0
28 920 0
29 921 0
... ... ...
388 1280 0
389 1281 0
390 1282 0
391 1283 1
392 1284 0
393 1285 0
394 1286 0
395 1287 1
396 1288 0
397 1289 1
398 1290 0
399 1291 0
400 1292 1
401 1293 0
402 1294 1
403 1295 0
404 1296 0
405 1297 0
406 1298 0
407 1299 0
408 1300 1
409 1301 1
410 1302 1
411 1303 1
412 1304 1
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns


In [149]:
sub_samp.Survived = y_pred
sub_samp


Out[149]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 0
5 897 0
6 898 0
7 899 0
8 900 1
9 901 0
10 902 0
11 903 0
12 904 1
13 905 0
14 906 1
15 907 1
16 908 0
17 909 0
18 910 0
19 911 0
20 912 0
21 913 1
22 914 1
23 915 0
24 916 1
25 917 0
26 918 1
27 919 0
28 920 1
29 921 0
... ... ...
388 1280 0
389 1281 0
390 1282 1
391 1283 1
392 1284 0
393 1285 0
394 1286 0
395 1287 1
396 1288 0
397 1289 1
398 1290 0
399 1291 0
400 1292 1
401 1293 0
402 1294 1
403 1295 1
404 1296 1
405 1297 1
406 1298 0
407 1299 0
408 1300 1
409 1301 1
410 1302 1
411 1303 1
412 1304 0
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns


In [153]:
sub_samp.to_csv('baseline_sub.csv', index = False) # 0.76077

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [29]:
f, ax = plt.subplots(figsize=(8, 8))
sns.barplot(
    ax=ax,
    x='Embarked',
    y='Survived',
    hue='Sex',
    data=train,
    capsize=0.05
)
ax.set_title("Survival by port of Embarction ")
ax.set_ylabel("Survival (%)")
ax.set_xlabel("")
ax.set_xticklabels(["C", "Q", "S"])
plt.show()


Выживших из Queenstown заметно больше. Сравним распределния стоимости билетов в этих городах.


In [49]:
f, ax = plt.subplots(figsize=(8, 8))
sns.barplot(
    ax=ax,
    x='Embarked',
    y='Survived',
    hue='Pclass',
    data=train,
    capsize=0.05
)
ax.set_title("Survival by port of Embarction and class of ticket")
ax.set_ylabel("Survival (%)")
ax.set_xlabel("")
ax.set_xticklabels(["C", "Q", "S"])
plt.show()



In [65]:
print ('1st class passengers from Cherbourg:'+ str(len(train['Fare'][train['Embarked']=='C'][train['Pclass']==1])))
print ('1st class passengers from Queenstown :'+ str(len(train['Fare'][train['Embarked']=='Q'][train['Pclass']==1])))
print ('1st class passengers from Southampton :'+ str(len(train['Fare'][train['Embarked']=='S'][train['Pclass']==1])))
print ('2nd class passengers from Cherbourg:'+ str(len(train['Fare'][train['Embarked']=='C'][train['Pclass']==2])))
print ('2nd class passengers from Queenstown :'+ str(len(train['Fare'][train['Embarked']=='Q'][train['Pclass']==2])))
print ('2nd class passengers from Southampton :'+ str(len(train['Fare'][train['Embarked']=='S'][train['Pclass']==2])))


1st class passengers from Cherbourg:85
1st class passengers from Queenstown :2
1st class passengers from Southampton :129
2nd class passengers from Cherbourg:17
2nd class passengers from Queenstown :3
2nd class passengers from Southampton :164

Закодируем категориальные признаки


In [25]:
train.columns


Out[25]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

Кодировать графу имя бессымысленно, однако в списке


In [ ]:
train_cat = train[['Pclass']]